import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# Tokens that should be treated as NaN when parsing the CSV.
missing_values = ["?", ".", "", "_", "Na", "NULL", "null", "not", "Not", "NaN", "NA", "??", "nan", "inf"]
# SFPD incident reports (2018+); the unique Row ID becomes the index.
raw_data = pd.read_csv("Police_Department_Incident_Reports__2018_to_Present.csv", na_values=missing_values)
raw_data.set_index('Row ID', inplace=True)
raw_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 410210 entries, 95308704134 to 95316305151 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Incident Datetime 410210 non-null object 1 Incident Date 410210 non-null object 2 Incident Time 410210 non-null object 3 Incident Year 410210 non-null int64 4 Incident Day of Week 410210 non-null object 5 Report Datetime 410210 non-null object 6 Incident ID 410210 non-null int64 7 Incident Number 410210 non-null int64 8 CAD Number 318808 non-null float64 9 Report Type Code 410210 non-null object 10 Report Type Description 410210 non-null object 11 Filed Online 84136 non-null object 12 Incident Code 410210 non-null int64 13 Incident Category 409921 non-null object 14 Incident Subcategory 409921 non-null object 15 Incident Description 410210 non-null object 16 Resolution 410210 non-null object 17 Intersection 389130 non-null object 18 CNN 389130 non-null float64 19 Police District 410210 non-null object 20 Analysis Neighborhood 389043 non-null object 21 Supervisor District 389130 non-null float64 22 Latitude 389130 non-null float64 23 Longitude 389130 non-null float64 24 point 389130 non-null object dtypes: float64(5), int64(4), object(16) memory usage: 81.4+ MB
raw_data.head()
| Incident Datetime | Incident Date | Incident Time | Incident Year | Incident Day of Week | Report Datetime | Incident ID | Incident Number | CAD Number | Report Type Code | ... | Incident Description | Resolution | Intersection | CNN | Police District | Analysis Neighborhood | Supervisor District | Latitude | Longitude | point | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Row ID | |||||||||||||||||||||
| 95308704134 | 15/08/2020 12:43 | 15/08/2020 | 12:43 | 2020 | Saturday | 15/08/2020 12:58 | 953087 | 200490354 | 202281583.0 | II | ... | Battery | Open or Active | GENEVA AVE \ LONDON ST | 21475000.0 | Ingleside | Excelsior | 11.0 | 37.716039 | -122.440255 | (37.716038818883085, -122.44025513581519) |
| 64999771000 | 18/01/2018 19:00 | 18/01/2018 | 19:00 | 2018 | Thursday | 22/01/2018 16:59 | 649997 | 186068683 | NaN | II | ... | Lost Property | Open or Active | NaN | NaN | Out of SF | NaN | NaN | NaN | NaN | NaN |
| 95319604083 | 16/08/2020 3:13 | 16/08/2020 | 3:13 | 2020 | Sunday | 16/08/2020 3:14 | 953196 | 200491669 | 202290313.0 | II | ... | Firearm, Discharging in Grossly Negligent Manner | Open or Active | 23RD ST \ ARKANSAS ST | 23642000.0 | Bayview | Potrero Hill | 10.0 | 37.754827 | -122.397729 | (37.75482657770952, -122.39772873392515) |
| 95326228100 | 16/08/2020 3:38 | 16/08/2020 | 3:38 | 2020 | Sunday | 16/08/2020 4:56 | 953262 | 200491738 | 202290404.0 | II | ... | Malicious Mischief, Breaking Windows | Open or Active | VALENCIA ST \ 15TH ST | 24377000.0 | Mission | Mission | 9.0 | 37.766540 | -122.422044 | (37.76653957529556, -122.42204381448558) |
| 95322706244 | 15/08/2020 9:40 | 15/08/2020 | 9:40 | 2020 | Saturday | 15/08/2020 18:21 | 953227 | 206121692 | NaN | II | ... | Theft, From Locked Vehicle, >$950 | Open or Active | NaN | NaN | Park | NaN | NaN | NaN | NaN | NaN |
5 rows × 25 columns
raw_data.describe()
| Incident Year | Incident ID | Incident Number | CAD Number | Incident Code | CNN | Supervisor District | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|
| count | 410210.000000 | 410210.000000 | 4.102100e+05 | 3.188080e+05 | 410210.000000 | 3.891300e+05 | 389130.000000 | 389130.000000 | 389130.000000 |
| mean | 2018.886119 | 802774.974791 | 1.904148e+08 | 1.912847e+08 | 25089.386536 | 2.532681e+07 | 5.965577 | 37.769380 | -122.423775 |
| std | 0.789394 | 104865.461463 | 8.831851e+06 | 1.896881e+07 | 25785.005878 | 3.085594e+06 | 2.788198 | 0.024062 | 0.026120 |
| min | 2018.000000 | 618687.000000 | 0.000000e+00 | 1.000000e+00 | 1000.000000 | 2.001300e+07 | 1.000000 | 37.707988 | -122.511295 |
| 25% | 2018.000000 | 712073.250000 | 1.808363e+08 | 1.825131e+08 | 6244.000000 | 2.397300e+07 | 3.000000 | 37.756167 | -122.434062 |
| 50% | 2019.000000 | 802931.500000 | 1.904324e+08 | 1.914336e+08 | 7046.000000 | 2.491800e+07 | 6.000000 | 37.775873 | -122.417707 |
| 75% | 2020.000000 | 893632.000000 | 2.000466e+08 | 2.002821e+08 | 61030.000000 | 2.642200e+07 | 8.000000 | 37.785829 | -122.407337 |
| max | 2020.000000 | 984190.000000 | 9.811720e+08 | 1.000000e+09 | 75030.000000 | 5.412200e+07 | 11.000000 | 37.829991 | -122.363743 |
# Keep only the columns used below; IDs, redundant date parts and sparse
# administrative fields are dropped.
data = raw_data.drop([
'Incident Date' , 'Incident Time' , 'Incident Year' ,
'Report Datetime' , 'Report Type Code' , 'Report Type Description',
'Incident ID' , 'Incident Number' , 'CAD Number' ,
'Filed Online' , 'Incident Code' , 'Incident Subcategory' ,
'Incident Description', 'CNN' , 'Analysis Neighborhood' ,
'Supervisor District' , 'point'
], axis=1)
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 410210 entries, 95308704134 to 95316305151 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Incident Datetime 410210 non-null object 1 Incident Day of Week 410210 non-null object 2 Incident Category 409921 non-null object 3 Resolution 410210 non-null object 4 Intersection 389130 non-null object 5 Police District 410210 non-null object 6 Latitude 389130 non-null float64 7 Longitude 389130 non-null float64 dtypes: float64(2), object(6) memory usage: 28.2+ MB
# Parse the combined timestamp; the file is day-first "dd/mm/YYYY HH:MM".
data['Incident Datetime'] = pd.to_datetime(data['Incident Datetime'], format='%d/%m/%Y %H:%M')
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 410210 entries, 95308704134 to 95316305151 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Incident Datetime 410210 non-null datetime64[ns] 1 Incident Day of Week 410210 non-null object 2 Incident Category 409921 non-null object 3 Resolution 410210 non-null object 4 Intersection 389130 non-null object 5 Police District 410210 non-null object 6 Latitude 389130 non-null float64 7 Longitude 389130 non-null float64 dtypes: datetime64[ns](1), float64(2), object(5) memory usage: 28.2+ MB
data.isnull().sum().sort_values(ascending=False)
Longitude 21080 Latitude 21080 Intersection 21080 Incident Category 289 Police District 0 Resolution 0 Incident Day of Week 0 Incident Datetime 0 dtype: int64
# Drop rows with any missing value (~5% of rows, mostly missing location).
data.dropna(axis=0, inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 388843 entries, 95308704134 to 95316305151 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Incident Datetime 388843 non-null datetime64[ns] 1 Incident Day of Week 388843 non-null object 2 Incident Category 388843 non-null object 3 Resolution 388843 non-null object 4 Intersection 388843 non-null object 5 Police District 388843 non-null object 6 Latitude 388843 non-null float64 7 Longitude 388843 non-null float64 dtypes: datetime64[ns](1), float64(2), object(5) memory usage: 26.7+ MB
data.duplicated().sum()
24536
# Remove the ~24.5k fully-duplicated rows counted above.
data.drop_duplicates(inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 364307 entries, 95308704134 to 95316305151 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Incident Datetime 364307 non-null datetime64[ns] 1 Incident Day of Week 364307 non-null object 2 Incident Category 364307 non-null object 3 Resolution 364307 non-null object 4 Intersection 364307 non-null object 5 Police District 364307 non-null object 6 Latitude 364307 non-null float64 7 Longitude 364307 non-null float64 dtypes: datetime64[ns](1), float64(2), object(5) memory usage: 25.0+ MB
# Shorter working names for the remaining columns.
data.rename(columns={
'Incident Datetime': 'Datetime',
'Incident Day of Week': 'Weekday',
'Incident Category': 'Category',
'Police District': 'District',
}, inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 364307 entries, 95308704134 to 95316305151 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Datetime 364307 non-null datetime64[ns] 1 Weekday 364307 non-null object 2 Category 364307 non-null object 3 Resolution 364307 non-null object 4 Intersection 364307 non-null object 5 District 364307 non-null object 6 Latitude 364307 non-null float64 7 Longitude 364307 non-null float64 dtypes: datetime64[ns](1), float64(2), object(5) memory usage: 25.0+ MB
# Side-by-side boxplots to eyeball outliers in the coordinates.
plt.figure(figsize=(10, 5))
plt.subplot(1,2,1)
plt.boxplot(data['Latitude'])
plt.title('Latitude', fontsize=15)
plt.subplot(1,2,2)
plt.boxplot(data['Longitude'])
plt.title('Longitude', fontsize=15)
plt.show()
# `import scipy` alone does not guarantee submodules are loaded; import
# scipy.stats explicitly so scipy.stats.iqr is always available.
import scipy.stats

# Tukey's fences: a point is an outlier when it falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; report the outlier fraction per column.
for col in ['Latitude', 'Longitude']:
    prop = data[col]
    IQR = scipy.stats.iqr(prop)
    Q1 = np.percentile(prop, 25)
    Q3 = np.percentile(prop, 75)
    n_O_upper = data[prop > (Q3 + 1.5 * IQR)].shape[0]
    n_O_lower = data[prop < (Q1 - 1.5 * IQR)].shape[0]
    outliers_per = (n_O_upper + n_O_lower) / data.shape[0]
    print('Outliers Percentage of', prop.name, ':', outliers_per)
Outliers Percentage of Latitude : 0.005981219136607312 Outliers Percentage of Longitude : 0.06798387074637599
=> Có thể không cần loại bỏ Outliers vì số lượng không đáng kể
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
# Federal holidays within the observed period.
holidays = calendar().holidays(start=data['Datetime'].min(), end=data['Datetime'].max())
# Normalize timestamps to midnight before matching holiday dates.
# `.dt.normalize()` replaces `.dt.date.astype('datetime64')`, which is
# deprecated (and raises on modern pandas because it lacks a unit).
data['Holiday'] = data['Datetime'].dt.normalize().isin(holidays).astype('int64')
data.head()
| Datetime | Weekday | Category | Resolution | Intersection | District | Latitude | Longitude | Holiday | |
|---|---|---|---|---|---|---|---|---|---|
| Row ID | |||||||||
| 95308704134 | 2020-08-15 12:43:00 | Saturday | Assault | Open or Active | GENEVA AVE \ LONDON ST | Ingleside | 37.716039 | -122.440255 | 0 |
| 95319604083 | 2020-08-16 03:13:00 | Sunday | Assault | Open or Active | 23RD ST \ ARKANSAS ST | Bayview | 37.754827 | -122.397729 | 0 |
| 95326228100 | 2020-08-16 03:38:00 | Sunday | Malicious Mischief | Open or Active | VALENCIA ST \ 15TH ST | Mission | 37.766540 | -122.422044 | 0 |
| 95336264020 | 2020-08-16 13:40:00 | Sunday | Non-Criminal | Open or Active | 04TH ST \ MINNA ST | Southern | 37.784044 | -122.403712 | 0 |
| 95335012010 | 2020-08-16 16:18:00 | Sunday | Weapons Offense | Cite or Arrest Adult | ORTEGA ST \ 48TH AVE | Taraval | 37.751003 | -122.507416 | 0 |
len(data[data['Holiday'] == 1])
9530
from datetime import datetime, time

def time_in_range(start, end, x):
    """Return True when *x* lies inside [start, end], inclusive.

    Handles intervals that wrap past midnight (start > end).
    """
    if start > end:
        # e.g. 22:00-02:00 — match either side of midnight.
        return x >= start or x <= end
    return start <= x <= end

def map_business_hours(date):
    """True when the timestamp falls within 08:00-18:00, inclusive."""
    opening = time(8, 0, 0)
    closing = time(18, 0, 0)
    return time_in_range(opening, closing, date.time())
data['BusinessHour'] = data['Datetime'].map(map_business_hours).astype('int64')
data.head()
| Datetime | Weekday | Category | Resolution | Intersection | District | Latitude | Longitude | Holiday | BusinessHour | |
|---|---|---|---|---|---|---|---|---|---|---|
| Row ID | ||||||||||
| 95308704134 | 2020-08-15 12:43:00 | Saturday | Assault | Open or Active | GENEVA AVE \ LONDON ST | Ingleside | 37.716039 | -122.440255 | 0 | 1 |
| 95319604083 | 2020-08-16 03:13:00 | Sunday | Assault | Open or Active | 23RD ST \ ARKANSAS ST | Bayview | 37.754827 | -122.397729 | 0 | 0 |
| 95326228100 | 2020-08-16 03:38:00 | Sunday | Malicious Mischief | Open or Active | VALENCIA ST \ 15TH ST | Mission | 37.766540 | -122.422044 | 0 | 0 |
| 95336264020 | 2020-08-16 13:40:00 | Sunday | Non-Criminal | Open or Active | 04TH ST \ MINNA ST | Southern | 37.784044 | -122.403712 | 0 | 1 |
| 95335012010 | 2020-08-16 16:18:00 | Sunday | Weapons Offense | Cite or Arrest Adult | ORTEGA ST \ 48TH AVE | Taraval | 37.751003 | -122.507416 | 0 | 1 |
data['BusinessHour'].value_counts()
1 194717 0 169590 Name: BusinessHour, dtype: int64
def daypart(hour):
    """Map an hour of the day to a named part of the day.

    Accepts either an int (0-23) or a string such as '07' — the original
    only matched zero-padded 2-character strings, so plain ints or '7'
    silently fell through to 'Night'.
    """
    h = int(hour)
    if h == 23 or h <= 2:
        return 'Midnight'
    elif 3 <= h <= 6:
        return 'Early Morning'
    elif 7 <= h <= 10:
        return 'Morning'
    elif 11 <= h <= 14:
        return 'Noon'
    elif 15 <= h <= 18:
        return 'Evening'
    return 'Night'  # 19-22
# Decompose the timestamp into calendar features for the plots below.
data['Year'] = data['Datetime'].dt.year
data['Month'] = data['Datetime'].dt.month
data['Day'] = data['Datetime'].dt.day
data['Hour'] = data['Datetime'].dt.hour
data['Minute'] = data['Datetime'].dt.minute
# daypart() receives the zero-padded "HH" slice of the time string.
data['Daypart'] = data['Datetime'].dt.time.apply(lambda x: daypart(str(x).split(':')[0]))
data['Weekend'] = data['Weekday'].apply(lambda x: 1 if x == 'Saturday' or x == 'Sunday' else 0)
data.head()
| Datetime | Weekday | Category | Resolution | Intersection | District | Latitude | Longitude | Holiday | BusinessHour | Year | Month | Day | Hour | Minute | Daypart | Weekend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Row ID | |||||||||||||||||
| 95308704134 | 2020-08-15 12:43:00 | Saturday | Assault | Open or Active | GENEVA AVE \ LONDON ST | Ingleside | 37.716039 | -122.440255 | 0 | 1 | 2020 | 8 | 15 | 12 | 43 | Noon | 1 |
| 95319604083 | 2020-08-16 03:13:00 | Sunday | Assault | Open or Active | 23RD ST \ ARKANSAS ST | Bayview | 37.754827 | -122.397729 | 0 | 0 | 2020 | 8 | 16 | 3 | 13 | Early Morning | 1 |
| 95326228100 | 2020-08-16 03:38:00 | Sunday | Malicious Mischief | Open or Active | VALENCIA ST \ 15TH ST | Mission | 37.766540 | -122.422044 | 0 | 0 | 2020 | 8 | 16 | 3 | 38 | Early Morning | 1 |
| 95336264020 | 2020-08-16 13:40:00 | Sunday | Non-Criminal | Open or Active | 04TH ST \ MINNA ST | Southern | 37.784044 | -122.403712 | 0 | 1 | 2020 | 8 | 16 | 13 | 40 | Noon | 1 |
| 95335012010 | 2020-08-16 16:18:00 | Sunday | Weapons Offense | Cite or Arrest Adult | ORTEGA ST \ 48TH AVE | Taraval | 37.751003 | -122.507416 | 0 | 1 | 2020 | 8 | 16 | 16 | 18 | Evening | 1 |
# Horizontal count plot of the 10 most frequent incident categories, with
# the count printed inside each bar.
plt.figure(figsize=(10, 7))
top10_category = data['Category'].value_counts()[:10]
ax = sns.countplot(
y = 'Category',
data = data,
order = top10_category.index
)
# Annotate each bar; the x-offset (-8000) pulls the label inside the bar.
for rect in ax.patches:
ax.text(
rect.get_width() - 8000,
rect.get_y() + rect.get_height() / 2,
rect.get_width(),
color = 'white',
weight = 'bold',
fontsize = 11
)
plt.title('Top 10 loại tội phạm ở San Francisco', fontsize=16)
plt.show()
=> Tội phạm thuộc loại Larceny Theft, cao hơn đáng kể so với bất kỳ loại tội phạm nào khác
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
# Incidents per (Year, Category): any always-non-null column works as the
# counted value — 'Daypart' is used here.
year_counts = data.groupby(['Year', 'Category']).count().reset_index().pivot(
index = 'Year',
columns = 'Category',
values = 'Daypart'
)
# Categories absent in a given year get 0 instead of NaN.
year_counts.fillna(0, inplace=True)
def style_axes(ax):
    """Strip an axes down for the bar-race frames: tiny unobtrusive
    ticks, a white x-grid drawn behind the bars, and no spines.

    Mutates *ax* in place; returns nothing.
    """
    ax.tick_params(labelsize=5, length=0)
    ax.grid(True, axis='x', color='white')
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_visible(False)
def prepare_data(df, steps=20):
    """Expand a Year-indexed wide frame into `steps` interpolated frames
    per year for a smooth bar-chart-race animation.

    Returns a tuple (values, ranks): `values` holds the interpolated
    counts, `ranks` the interpolated per-frame column ranks (bar
    positions), both indexed by Year.
    """
    df = df.reset_index()
    # Spread the original rows `steps` apart, leaving gaps to fill in.
    df.index = df.index * steps
    df_expanded = df.reindex(range(df.index[-1] + 1))
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    df_expanded['Year'] = df_expanded['Year'].ffill()
    df_expanded = df_expanded.set_index('Year')
    # Rank columns per frame *before* interpolating so bars glide
    # between positions instead of jumping.
    df_rank_expanded = df_expanded.rank(axis=1, method='first')
    df_expanded = df_expanded.interpolate()
    df_rank_expanded = df_rank_expanded.interpolate()
    return df_expanded, df_rank_expanded
def init():
# FuncAnimation init callback: start from a clean, styled axes.
# Reads the module-level `ax`.
ax.clear()
style_axes(ax)
def update(i):
# Draw frame i of the bar race from the module-level interpolated
# frames `df_expanded` (widths) and `df_rank_expanded` (positions).
ax.clear()
# NOTE(review): after ax.clear() there are no containers left, so this
# removal loop is effectively dead code.
for bar in ax.containers: bar.remove()
y = df_rank_expanded.iloc[i]
ax.barh(
y = y,
width = df_expanded.iloc[i],
color = plt.cm.Dark2(range(6)),
tick_label = df_expanded.columns
)
# Print each bar's value just past its right edge.
for rect in ax.patches:
ax.text(
rect.get_width() + 500,
rect.get_y() + rect.get_height() / 4,
int(rect.get_width()),
color = 'blue',
fontsize = 5.5
)
ax.set_ylim(min(y) - 1, max(y) + 1)
ax.set_title(
f'Tội phạm ở San Francisco - Năm {int(df_expanded.index[i])}',
fontsize=10
)
# Use plt.Figure (not plt.figure) so the figure is driven only by the
# animation and is not also drawn as a static notebook output.
fig = plt.Figure(figsize=(6.8, 6), dpi=144)
ax = fig.add_subplot()
df_expanded, df_rank_expanded = prepare_data(year_counts)
animation = FuncAnimation(
fig = fig,
func = update,
init_func = init,
frames = len(df_expanded),
interval = 100,
repeat = False
)
# Render the animation as interactive JS/HTML in the notebook.
HTML(animation.to_jshtml())
# NOTE(review): np.linspace(0, 10, 20) exceeds the colormap's [0, 1]
# domain, so values above 1 clip to the top color — probably meant (0, 1, 20).
color = plt.cm.winter(np.linspace(0, 10, 20))
data['Resolution'].value_counts().plot.bar(color=color)
plt.xticks(rotation=0, fontsize=11)
plt.title('Các giải pháp cho tội phạm',fontsize=16)
plt.show()
=> Hầu hết các sự cố đều đang được tiến hành xử lý
# Restrict to the ten most common categories for the solved/unsolved view.
most_commons = data[data['Category'].isin(top10_category.index)]
violent = most_commons.copy()
# NOTE(review): Arrest is coded 0 for 'Cite or Arrest Adult'/'Unfounded'
# and 1 otherwise (which includes 'Open or Active'), yet Arrest == 1 is
# plotted with the label 'Solved' below — the 0/1 coding looks inverted;
# confirm against the intended definition of "solved".
violent['Arrest'] = np.where(violent['Resolution'].isin(['Cite or Arrest Adult', 'Unfounded']), 0, 1)
arrest_counts = violent['Category'][violent.Arrest == 1].value_counts()[:10]
total_counts = violent['Category'].value_counts()[:10]
# Division aligns on the category index, yielding per-category fractions.
arrest_counts = arrest_counts / (total_counts).sort_index()
total_counts = total_counts / (total_counts).sort_index()
# Full-length (==1.0) bars behind the fraction bars.
total_counts.plot.barh(color='crimson', label= 'Unsolved')
arrest_counts.plot.barh(color='mediumseagreen', label='Solved')
plt.legend(bbox_to_anchor=(1.05, 1))
plt.show()
plt.figure(figsize = (12, 10))
# Arrest rate per category: citations/arrests divided by total cases.
arrest = data[data['Resolution'].isin(['Cite or Arrest Adult', 'Cite or Arrest Juvenile'])]
# NOTE(review): relies on older pandas where value_counts().reset_index()
# produces a column literally named 'index'.
a = arrest['Category'].value_counts().reset_index()
b = data['Category'].value_counts().reset_index()
a = a.merge(b, how='inner', on='index')
a.columns = ['Category', 'Arrests', 'Cases']
a['Arrests Percent'] = round(100 * (a['Arrests'] / a['Cases']), 2)
a.sort_values('Arrests Percent', ascending = False, inplace=True)
sns.barplot(x='Arrests Percent', y='Category', data=a)
plt.xlabel('')
plt.ylabel('')
plt.title('Điều gì sẽ khiến ta bị bắt ở San Francisco', fontsize=16)
plt.show()
=> Ma túy, vi phạm giao thông và lệnh bắt giữ là 3 lý do hàng đầu cho các vụ bắt giữ
# Incident counts per police district, annotated above each bar.
plt.figure(figsize=(10, 7))
color = plt.cm.spring(np.linspace(0, 1, 12))
ax = data['District'].value_counts().plot.bar(color=color)
for p in ax.patches:
height = p.get_height()
ax.text(
x = p.get_x() + p.get_width() / 2,
y = height + 1000,
s = height,
ha = 'center'
)
plt.xticks(rotation=0, fontsize=11)
plt.title('Quận có nhiều tội phạm nhất', fontsize=16)
plt.show()
=> Quận Central là nơi ghi nhận nhiều tội phạm nhất và quận Park là nơi ít tội phạm nhất ở San Francisco
# Stacked, row-normalized district share for every category.
df = pd.crosstab(data['Category'], data['District'])
color = plt.cm.Greys(np.linspace(0, 1, 10))
df.div(df.sum(1).astype(float), axis=0).plot.bar(stacked=True, color=color, figsize=(15, 7))
plt.legend(loc='upper left', bbox_to_anchor=(1, 0, 0.5, 1), fontsize=12)
plt.xlabel('')
plt.title('Quận vs Loại tội phạm', fontsize=16)
plt.show()
plt.figure(figsize = (12, 5))
# Arrest rate per district (uses `arrest` computed above).
a = arrest['District'].value_counts().reset_index()
b = data['District'].value_counts().reset_index()
a = a.merge(b,how = 'inner', on = 'index')
a.columns = ['District', 'Arrests','Cases']
a['Arrests Percent'] = round(100 * a['Arrests'] / a['Cases'], 2)
a.sort_values('Arrests Percent', ascending = False, inplace = True)
# NOTE(review): palette highlights fixed positions 0 and 9 — correct only
# while the sort order keeps the intended districts at those positions.
sns.barplot(
x = 'District',
y = 'Arrests Percent',
data = a,
palette = ['red'] + ['grey'] * 8 + ['blue'] + ['grey']
)
# Print the percentage on top of each of the 10 bars.
for i in range(10):
plt.text(
x = i,
y = a['Arrests Percent'] .iloc[i],
s = f"{a['Arrests Percent'].iloc[i]}%",
horizontalalignment = 'center',
verticalalignment = 'bottom',
weight = 'bold'
)
plt.xticks(fontsize=11)
plt.xlabel('')
plt.ylabel('')
plt.title('Nơi mà cảnh sát nghiêm khắc nhất - Phần trăm bị bắt', fontsize=16)
plt.show()
=> Mặc dù có ít tội phạm hơn được ghi nhận ở quận Tenderloin nhưng tỷ lệ bắt giữ ở đây khá cao 36,09%
del df
df = data.copy()
# Incidents per calendar day: group by date, count any one column.
df = df.groupby(df['Datetime'].dt.date).count().iloc[:, 0]
color_palette = sns.color_palette()
# Density of daily incident counts, with the median marked.
sns.kdeplot(data=df, shade=True)
plt.axvline(
x = df.median(),
ymax = 0.95,
linestyle = '--',
color = color_palette[1]
)
plt.annotate(
f'Median: {df.median()}',
xy = (df.median(), 0.004),
xytext = (200, 0.005),
arrowprops = dict(
arrowstyle = '->',
color = color_palette[1],
shrinkB = 10
)
)
plt.xlabel('Incidents')
plt.ylabel('Density')
plt.title('Phân phối số lượng sự cố mỗi ngày', fontdict={'fontsize': 16})
plt.show()
# Row-normalized share of each top-10 category across weekdays.
cat_per_week_common = pd.crosstab(most_commons['Category'], most_commons['Weekday'])
cat_per_week_common = cat_per_week_common.div(cat_per_week_common.sum(axis=1), axis=0)
# Reorder columns chronologically (crosstab sorts them alphabetically).
cat_per_week_common = cat_per_week_common[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']]
ax = sns.heatmap(cat_per_week_common, cmap="BuPu", linewidths=0.5)
plt.xticks(fontsize=11, rotation=45, ha='right')
plt.yticks(fontsize=11)
plt.xlabel('')
plt.ylabel('')
plt.show()
# Weekday share of incidents; the most common weekday slice is exploded.
data['Weekday'].value_counts().plot.pie(
explode = [0.1, 0, 0, 0, 0, 0, 0],
textprops = {'color': 'w', 'fontsize': 15, 'weight': 'bold'},
autopct = "%.1f%%"
)
plt.legend(loc='center left', bbox_to_anchor=(1, 0, 0.5, 1), fontsize=12)
plt.axis('off')
plt.title('Số lượng tội phạm theo thứ', fontsize=16)
plt.show()
=> Thứ 6 là ngày mà tội phạm được ghi nhận nhiều nhất, tiếp theo là Thứ 4. Chủ nhật là ngày ít tội phạm nhất
del df
df = data.copy()
df['Date'] = df['Datetime'].dt.date
# Count incidents per (hour, date, category), then average over dates to
# get the mean hourly incident count per category.
df = df.groupby(['Hour', 'Date', 'Category'], as_index=False).count().iloc[:, :4]
df.rename(columns={'Datetime': 'Incidents'}, inplace=True)
df = df.groupby(['Hour', 'Category'], as_index=False).mean()
# Only categories ranked 5-10 (the top 4 would dwarf the rest).
df = df[df['Category'].isin(top10_category[4:].index)]
fig, ax = plt.subplots(figsize=(12, 5))
ax = sns.lineplot(x='Hour', y='Incidents', data=df, hue='Category')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=6)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.suptitle('Số sự cố trung bình mỗi giờ')
plt.show()
plt.figure(figsize=(15, 7))
# NOTE(review): np.linspace(0, 4, 100) exceeds the colormap's [0, 1] domain.
color = plt.cm.twilight(np.linspace(0, 4, 100))
# NOTE(review): head(24) takes the 24 most frequent *exact* times (to the
# minute), not 24 hourly bins — use dt.hour if a true hourly distribution
# is intended.
data['Datetime'].dt.time.value_counts().head(24).sort_index().plot.bar(color=color)
plt.xticks(rotation=45, fontsize=11)
plt.title('Phân bố tội phạm trong ngày', fontsize=16)
plt.show()
# Incident counts and percentage share per part of the day.
df = data['Daypart'].value_counts().reset_index()
df_normalize = data['Daypart'].value_counts(normalize=True)
sns.barplot(x=df['index'], y='Daypart', data=df)
# Print the percentage inside each of the 6 bars at a fixed height.
for i in range(6):
plt.text(
x = i,
y = 5000,
s = f'{round(100 * df_normalize[i], 2)}%',
horizontalalignment = 'center',
color = 'white',
weight = 'bold'
)
plt.title('Số lượng tội phạm theo các buổi trong ngày', fontsize=16)
plt.show()
=> Hãy cẩn thận vào buổi tối. Gần 1/4 số vụ tội phạm xảy ra vào buổi tối
plt.figure(figsize = (15, 7))
# Incident counts cross-tabulated as weekday x daypart.
pivot_table = pd.pivot_table(
columns = data['Daypart'] ,
index = 'Weekday',
values = 'Daypart' ,
aggfunc = 'count',
data = data
)
# Reorder both axes chronologically instead of alphabetically.
pivot_table = pivot_table.reindex(
index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
columns = ['Early Morning', 'Morning', 'Noon', 'Evening', 'Night', 'Midnight']
)
sns.heatmap(pivot_table, cmap='Reds', annot=True, fmt='d')
plt.xlabel('')
plt.ylabel('')
plt.show()
=> Buổi tối thường là lúc có nhiều vụ phạm tội, tối Thứ 6 đặc biệt tồi tệ và tối Chủ nhật tương đối tốt hơn
# Incident counts per calendar month (all years pooled).
sns.countplot(data['Month'], palette='autumn')
plt.xticks(fontsize=11)
plt.xlabel('')
plt.title('Tội phạm theo từng tháng', fontsize=16)
plt.show()
=> Tháng 1 là tháng có nhiều tội phạm nhất
plt.figure(figsize = (15, 7))
# Incident counts cross-tabulated as month x day-of-month.
pivot_table = pd.pivot_table(
columns = data['Day'],
values = 'Day',
index = 'Month',
aggfunc = 'count',
data = data
)
sns.heatmap(pivot_table, cmap='Reds')
plt.xticks(fontsize=12)
# Replace the numeric month ticks (offset 0.5 centers them on the rows).
plt.yticks(
np.arange(0.5, 12.5),
['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec'],
fontsize = 12
)
plt.xlabel('')
plt.ylabel('')
plt.show()
=> Ngày 1/1 dường như là ngày có số tội phạm được ghi nhận cao nhất và ngày 25/12 là ngày có số tội phạm được ghi nhận ít nhất
# Observation period of the cleaned data.
first_date = data['Datetime'].min()
last_date = data['Datetime'].max()
print('Ngày đầu tiên:', first_date)
print('Ngày cuối cùng:', last_date)
print(f'Số ngày tổng cộng: {(last_date - first_date).days + 1}')
Ngày đầu tiên: 2018-01-01 00:00:00 Ngày cuối cùng: 2020-12-04 22:19:00 Số ngày tổng cộng: 1069
# Daily (midnight) timestamps spanning the whole observation period.
time_span = pd.date_range(first_date, last_date)
def to_time_series(df):
# Count incidents per unique timestamp, in chronological order.
date = df['Datetime'].value_counts().index
ts = pd.DataFrame(
data = df['Datetime'].value_counts().values[np.argsort(date)],
index = sorted(date),
columns = ['count']
)
# NOTE(review): 'Datetime' still carries the time of day, so reindexing
# on daily-midnight timestamps keeps only counts of incidents stamped
# exactly 00:00 — if counts *per day* were intended, group by dt.date
# (or normalize) first; confirm before reusing these numbers.
return ts.reindex(time_span, fill_value=0)
all_counts = to_time_series(data)
all_counts[all_counts['count'] == 0]
| count |
|---|
=> Không có ngày nào là bình yên (không có bất kỳ sự cố nào xảy ra)
# Time series of incident counts over the whole period.
plt.figure(figsize=(16, 7))
all_counts['count'].plot()
plt.xlabel('Thời gian')  # fixed typo: 'Thới gian' -> 'Thời gian'
plt.ylabel('Số lượng tội phạm')
plt.show()
=> Liệu các thay đổi này có đúng cho các loại tội phạm riêng lẻ ? Ta sẽ thử với Larceny Theft
# Same time series restricted to Larceny Theft; list its zero-count days.
df_theft = data[data['Category'] == 'Larceny Theft']
theft_counts = to_time_series(df_theft)
theft_counts[theft_counts['count'] == 0]
| count | |
|---|---|
| 2018-01-03 | 0 |
| 2018-01-11 | 0 |
| 2018-02-05 | 0 |
| 2018-02-08 | 0 |
| 2018-02-22 | 0 |
| ... | ... |
| 2020-10-24 | 0 |
| 2020-10-28 | 0 |
| 2020-11-10 | 0 |
| 2020-11-28 | 0 |
| 2020-11-29 | 0 |
65 rows × 1 columns
# Time series of Larceny Theft counts.
plt.figure(figsize=(16, 7))
theft_counts['count'].plot()
plt.xlabel('Thời gian')  # fixed typo: 'Thới gian' -> 'Thời gian'
plt.ylabel('Số lượng tội phạm Larceny Theft')
plt.show()
=> Nó thực sự không khác nhau nhiều
# Monthly totals ('MS' anchors each bucket at the month start).
y = theft_counts.resample('MS').sum()
y.tail()
| count | |
|---|---|
| 2020-08-01 | 78 |
| 2020-09-01 | 77 |
| 2020-10-01 | 72 |
| 2020-11-01 | 57 |
| 2020-12-01 | 5 |
Do dữ liệu chỉ được ghi cho đến ngày 4 tháng 12 năm 2020, nên ta sẽ không dùng dữ liệu của tháng 12 năm 2020 trở đi
fig = plt.figure(figsize=(20,6))
# Drop the partial final month (December 2020 only covers 4 days).
y = y[:-1]
y['count'].plot()
plt.xlabel('Thời gian')
plt.ylabel('Số lượng Larceny Theft mỗi tháng')
plt.show()
=> Ta có thể thấy rằng có sự giảm mạnh từ tháng 7 có thể là do ảnh hưởng của Covid-19
Ở đây, những danh sách tội phạm này có thể được khám phá và mô hình hóa thêm nếu có thêm dữ kiện của nhiều năm trước:
# Recover the Description column for the rows kept in `data`.
# Take an explicit copy: calling rename(inplace=True) on a
# boolean-indexed view of raw_data triggers SettingWithCopyWarning and
# may not stick on some pandas versions.
data_with_description = raw_data[raw_data.index.isin(data.index)].copy()
data_with_description.rename(columns={
    'Incident Datetime': 'Datetime',
    'Incident Day of Week': 'Weekday',
    'Incident Category': 'Category',
    'Incident Description': 'Description',
    'Police District': 'District',
}, inplace=True)
from wordcloud import WordCloud
description = data_with_description['Description']
# Join the actual description texts. `str(description)` would stringify
# the Series *repr* — index labels, dtype line and a truncated preview —
# so the cloud was built from almost none of the real words.
text = ' '.join(description.dropna().astype(str))
wc = WordCloud(background_color='#e9eaf1', width=1000, height=500).generate(text)
plt.imshow(wc)
plt.axis('off')
plt.title('Mô tả cho các tội', fontsize=20)
plt.show()
import plotly.express as px
# Counts per (district, category) for an interactive sunburst chart.
discat = data_with_description.groupby(['District', 'Category'])['Description'].count().reset_index()
fig = px.sunburst(
discat.rename(columns={'Description': 'Count'}),
path = ['District', 'Category'],
values = 'Count',
color = 'Count'
)
fig.show()
# Top 15 intersections by incident count, labelled inside the bars.
plt.figure(figsize=(10, 7))
ax = sns.countplot(
y = 'Intersection',
data = data,
order = data['Intersection'].value_counts()[:15].index
)
for rect in ax.patches:
ax.text(
rect.get_width() - 150,
rect.get_y() + rect.get_height() / 2,
rect.get_width(),
color = 'white',
weight = 'bold',
fontsize = 11
)
plt.title('Top 15 vùng phạm tội', fontsize=16)
plt.show()
del df
df = data.copy()
# Map centre and bounding box (south-west / north-east corners).
mean_lat = np.mean(df.Latitude)
mean_lon = np.mean(df.Longitude)
sw = df[['Latitude', 'Longitude']].min().values.tolist()
ne = df[['Latitude', 'Longitude']].max().values.tolist()
# "lon, lat" string key to find the most frequent exact locations.
df['Coordinates'] = df['Longitude'].astype(str) + ', ' + df['Latitude'].astype(str)
df.Coordinates.value_counts()[:5]
=> Cặp tọa độ phổ biến nhất (-122.40733700000001, 37.78456014) là vị trí của Market St
# Raw scatter of all incident locations.
plt.title('Phân bố tội phạm')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.scatter(data.Longitude, data.Latitude, s=0.5, c='r')
plt.show()
=> Ta có thể thấy có nhiều tội phạm hơn ở phần đông bắc của San Francisco. Câu hỏi đặt ra rằng liệu điều này có luôn như vậy trong suốt các năm qua không ?
Ta sẽ xem liệu phân phối có bất kỳ thay đổi đáng kể nào không:
plt.figure(figsize = (16, 5))
# Compare the January spatial distribution across 2018-2020.
for index, year in enumerate(range(2018, 2021)):
# Select January of `year` via an open interval (Dec 31 prev, Feb 1).
df_jan = data[(data['Datetime'] < f'{year}-02') & (data['Datetime'] > f'{year - 1}-12-31')]
plt.subplot(1, 3, index + 1)
plt.scatter(df_jan.Longitude, df_jan.Latitude, s=0.5, c='r')
sns.kdeplot(df_jan.Longitude, df_jan.Latitude)
plt.xticks(rotation=45)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title(f'Phân bố tội phạm vào tháng 1 năm {year}')
plt.show()
=> Có vẻ như phần đông bắc của San Francisco luôn là khu vực nguy hiểm nhất
import geopandas as gpd
from shapely.geometry import Point
# Turn (lon, lat) pairs into shapely Points and wrap in a GeoDataFrame.
gdf = data.copy()
gdf['Coordinates'] = list(zip(gdf.Longitude, gdf.Latitude))
gdf.Coordinates = gdf.Coordinates.apply(Point)
# Use the modern authority string; the {'init': 'epsg:4326'} dict form is
# deprecated by pyproj and is what produced the PROJ errors below.
gdf = gpd.GeoDataFrame(gdf, geometry='Coordinates', crs='EPSG:4326')
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(color='white', edgecolor='black')
gdf.plot(ax=ax, color='red')
plt.show()
PROJ: proj_create_from_database: SQLite error on SELECT name, type, coordinate_system_auth_name, coordinate_system_code, datum_auth_name, datum_code, area_of_use_auth_name, area_of_use_code, text_definition, deprecated FROM geodetic_crs WHERE auth_name = ? AND code = ?: no such column: area_of_use_auth_name
# Approximate label coordinates ("lat, lon") for each police district.
district_labels = {
'Southern': '37.774432, -122.401121',
'Bayview': '37.734332, -122.389920',
'Mission': '37.756478, -122.423663',
'Northern': '37.787740, -122.430300',
'Tenderloin': '37.781980, -122.412981',
'Central': '37.796200, -122.409293',
'Park': '37.765352, -122.449282',
'Richmond': '37.776204, -122.483285',
'Ingleside': '37.726817, -122.437207',
'Taraval': '37.737775, -122.484375'
}
# District -> incident count, ordered like district_labels; uppercase the
# names to match the shapefile's district column.
table = data['District'].value_counts().reindex(district_labels.keys())
table = table.reset_index().rename(
{'index': 'District', 'District': 'Count'},
axis='columns'
)
table['District'] = table['District'].str.upper()
table
| District | Count | |
|---|---|---|
| 0 | SOUTHERN | 44896 |
| 1 | BAYVIEW | 32120 |
| 2 | MISSION | 48846 |
| 3 | NORTHERN | 50818 |
| 4 | TENDERLOIN | 35127 |
| 5 | CENTRAL | 55065 |
| 6 | PARK | 17638 |
| 7 | RICHMOND | 21415 |
| 8 | INGLESIDE | 28413 |
| 9 | TARAVAL | 25815 |
import urllib.request
import shutil
import zipfile
# Download the SFPD district shapefile bundle and extract it to ./pd_data.
url = 'https://data.sfgov.org/api/geospatial/wkhw-cjsf?method=export&format=Shapefile'
with urllib.request.urlopen(url) as response, open('pd_data.zip', 'wb') as out_file:
shutil.copyfileobj(response, out_file)
with zipfile.ZipFile('pd_data.zip', 'r') as zip_ref:
zip_ref.extractall('pd_data')
import os
import re
# Locate and load the extracted shapefile. The original formatted the
# literal string '(unknown)' into the path instead of {filename}, so
# read_file could never open the file it had just matched. The pattern is
# a raw string and anchored with $ so sidecar files such as
# 'foo.shp.xml' are not picked up by mistake.
for filename in os.listdir('./pd_data/'):
    if re.match(r".+\.shp$", filename):
        districts = gpd.read_file(f'./pd_data/{filename}')
        break
districts.crs={'init': 'epsg:3857'}
districts = districts.merge(
table.set_index(['District']),
how = 'inner',
left_on = 'district',
right_index = True,
suffixes = ('_x', '_y')
)
districts
| company | district | shape_area | shape_le_1 | shape_leng | geometry | Count | |
|---|---|---|---|---|---|---|---|
| 0 | B | SOUTHERN | 9.134414e+07 | 100231.353916 | 87550.275142 | MULTIPOLYGON (((-122.39186 37.79425, -122.3917... | 44896 |
| 1 | C | BAYVIEW | 2.013846e+08 | 144143.480351 | 163013.798332 | POLYGON ((-122.38098 37.76480, -122.38103 37.7... | 32120 |
| 2 | D | MISSION | 8.062384e+07 | 40518.834235 | 40152.783389 | POLYGON ((-122.40954 37.76932, -122.40862 37.7... | 48846 |
| 3 | E | NORTHERN | 8.278169e+07 | 50608.310321 | 56493.858208 | POLYGON ((-122.43379 37.80793, -122.43375 37.8... | 50818 |
| 4 | J | TENDERLOIN | 1.107215e+07 | 18796.784185 | 12424.268969 | POLYGON ((-122.40217 37.78626, -122.41718 37.7... | 35127 |
| 5 | A | CENTRAL | 5.595027e+07 | 67686.522865 | 64025.129073 | POLYGON ((-122.42612 37.80684, -122.42612 37.8... | 55065 |
| 6 | F | PARK | 8.487896e+07 | 50328.913294 | 46307.776968 | POLYGON ((-122.43956 37.78314, -122.43832 37.7... | 17638 |
| 7 | G | RICHMOND | 1.379640e+08 | 75188.628361 | 69991.465355 | POLYGON ((-122.44127 37.79149, -122.44060 37.7... | 21415 |
| 8 | H | INGLESIDE | 1.935805e+08 | 74474.181164 | 74737.936295 | POLYGON ((-122.40450 37.74858, -122.40407 37.7... | 28413 |
| 9 | I | TARAVAL | 2.846767e+08 | 73470.424000 | 75350.217521 | POLYGON ((-122.49842 37.70810, -122.49842 37.7... | 25815 |
import contextily as ctx

# Normalise the raw counts by the number of distinct `Datetime` values
# (presumably one per day — TODO confirm that Datetime is date-granular).
districts['incidents_per_day'] = districts['Count'] / data['Datetime'].nunique()

fig, ax = plt.subplots(figsize=(12, 7))
districts.plot(
    column = 'incidents_per_day',
    cmap = 'Reds',
    alpha = 0.6,
    edgecolor = 'r',
    linestyle = '-',
    linewidth = 1,
    legend = True,
    ax = ax)

# Write each district's name at the centroid of its polygon.
for _, row in districts.iterrows():
    centroid = row.geometry.centroid
    plt.annotate(
        row.district,
        (centroid.x, centroid.y),
        color = '#353535',
        fontsize = 'large',
        fontweight = 'heavy',
        horizontalalignment = 'center'
    )
plt.show()
import geoplot as gplt

crimes = data['Category'].unique().tolist()

# Dissolve all district polygons into one land mask used to clip the KDEs.
sf_land = districts.unary_union
sf_land = gpd.GeoDataFrame(gpd.GeoSeries(sf_land), crs='EPSG:4326')
sf_land = sf_land.rename(columns={0: 'geometry'}).set_geometry('geometry')

# BUG FIX: the original built a 2x3 grid with plt.subplots() and then
# stacked a *second* set of axes on top via fig.add_subplot(), leaving the
# shared grid empty underneath; it also called tight_layout with a
# zero-area rect=[0, 0, 0, 0] and only hid the axes / titled the last
# subplot. Iterate over the grid's own axes instead.
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(15, 7))
for ax, crime in zip(axes.ravel(), np.random.choice(crimes, size=6, replace=False)):
    gplt.kdeplot(
        gdf.loc[gdf['Category'] == crime],
        shade = True,
        shade_lowest = False,
        clip = sf_land.geometry,
        cmap = 'Reds',
        ax = ax)
    gplt.polyplot(sf_land, ax=ax)
    ax.set_title(crime)
    ax.set_axis_off()
fig.tight_layout()
fig.suptitle('Mật độ địa lý của các tội phạm khác nhau')
plt.show()
import folium

# GeoJSON boundaries of the SF police districts.
gjson = r'https://cocl.us/sanfran_geojson'
figure = folium.Figure(width=900, height=600)
sf_map = folium.Map(location=[37.77, -122.42], zoom_start=12)
# FIX: Map.choropleth() was deprecated and later removed from folium;
# the supported form is the folium.Choropleth class added to the map.
folium.Choropleth(
    geo_data = gjson,
    data = table,
    columns = ['District', 'Count'],
    key_on = 'feature.properties.DISTRICT',
    fill_color = 'YlOrRd',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = 'Crimes in San Francisco'
).add_to(sf_map)
figure.add_child(sf_map)
figure
%%html
<iframe
src="https://data.sfgov.org/dataset/Map-of-Police-Department-Incident-Reports-2018-to-/jq29-s5wp/embed?width=950&height=600"
width="950"
height="600"
style="border:0; padding: 0; margin: 0;"
></iframe>
# Build the modelling table by removing columns that won't be used as
# predictors. NOTE: DataFrame.drop already returns a new frame, so the
# trailing .copy() in the original was redundant.
data_new = data.drop(columns=['Datetime', 'Resolution', 'Intersection'])
data_new.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 364307 entries, 95308704134 to 95316305151 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Weekday 364307 non-null object 1 Category 364307 non-null object 2 District 364307 non-null object 3 Latitude 364307 non-null float64 4 Longitude 364307 non-null float64 5 Holiday 364307 non-null int64 6 BusinessHour 364307 non-null int64 7 Year 364307 non-null int64 8 Month 364307 non-null int64 9 Day 364307 non-null int64 10 Hour 364307 non-null int64 11 Minute 364307 non-null int64 12 Daypart 364307 non-null object 13 Weekend 364307 non-null int64 dtypes: float64(2), int64(8), object(4) memory usage: 51.7+ MB
from sklearn.preprocessing import LabelEncoder

# Integer-encode the ordinal categoricals (day of week, part of day).
# NOTE(review): LabelEncoder assigns codes in *alphabetical* order, which
# does not follow the natural weekday/daypart ordering — if true ordinal
# codes matter downstream, use OrdinalEncoder with explicit categories.
encoded_cols = ['Weekday', 'Daypart']
data_new[encoded_cols] = data_new[encoded_cols].apply(LabelEncoder().fit_transform)
data_new.head()
| Weekday | Category | District | Latitude | Longitude | Holiday | BusinessHour | Year | Month | Day | Hour | Minute | Daypart | Weekend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Row ID | ||||||||||||||
| 95308704134 | 2 | Assault | Ingleside | 37.716039 | -122.440255 | 0 | 1 | 2020 | 8 | 15 | 12 | 43 | 5 | 1 |
| 95319604083 | 3 | Assault | Bayview | 37.754827 | -122.397729 | 0 | 0 | 2020 | 8 | 16 | 3 | 13 | 0 | 1 |
| 95326228100 | 3 | Malicious Mischief | Mission | 37.766540 | -122.422044 | 0 | 0 | 2020 | 8 | 16 | 3 | 38 | 0 | 1 |
| 95336264020 | 3 | Non-Criminal | Southern | 37.784044 | -122.403712 | 0 | 1 | 2020 | 8 | 16 | 13 | 40 | 5 | 1 |
| 95335012010 | 3 | Weapons Offense | Taraval | 37.751003 | -122.507416 | 0 | 1 | 2020 | 8 | 16 | 16 | 18 | 1 | 1 |
# One-hot encode the (nominal) police district: pop the original column
# out of the frame and append its indicator columns.
data_new = data_new.join(pd.get_dummies(data_new.pop('District')))
data_new.head()
| Weekday | Category | Latitude | Longitude | Holiday | BusinessHour | Year | Month | Day | Hour | ... | Central | Ingleside | Mission | Northern | Out of SF | Park | Richmond | Southern | Taraval | Tenderloin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Row ID | |||||||||||||||||||||
| 95308704134 | 2 | Assault | 37.716039 | -122.440255 | 0 | 1 | 2020 | 8 | 15 | 12 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 95319604083 | 3 | Assault | 37.754827 | -122.397729 | 0 | 0 | 2020 | 8 | 16 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 95326228100 | 3 | Malicious Mischief | 37.766540 | -122.422044 | 0 | 0 | 2020 | 8 | 16 | 3 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 95336264020 | 3 | Non-Criminal | 37.784044 | -122.403712 | 0 | 1 | 2020 | 8 | 16 | 13 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 95335012010 | 3 | Weapons Offense | 37.751003 | -122.507416 | 0 | 1 | 2020 | 8 | 16 | 16 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 24 columns
# Frequency of each crime category — used below to collapse rare classes.
category_counts = data_new['Category'].value_counts()
category_counts
Larceny Theft 108427 Other Miscellaneous 27001 Non-Criminal 24135 Malicious Mischief 24040 Assault 22233 Burglary 19818 Motor Vehicle Theft 16747 Warrant 12760 Fraud 11037 Lost Property 10628 Recovered Vehicle 9849 Robbery 9008 Missing Person 8356 Suspicious Occ 7729 Offences Against The Family And Children 7184 Disorderly Conduct 6716 Drug Offense 6372 Traffic Violation Arrest 4845 Miscellaneous Investigation 3695 Other 3023 Other Offenses 2902 Weapons Offense 2200 Stolen Property 2008 Case Closure 1641 Forgery And Counterfeiting 1592 Weapons Carrying Etc 1454 Courtesy Report 1145 Sex Offense 1133 Arson 1038 Civil Sidewalks 820 Traffic Collision 769 Vandalism 667 Prostitution 586 Embezzlement 550 Family Offense 517 Fire Report 470 Vehicle Impounded 275 Vehicle Misplaced 174 Suicide 171 Drug Violation 120 Rape 98 Human Trafficking (A), Commercial Sex Acts 85 Suspicious 71 Liquor Laws 69 Motor Vehicle Theft? 51 Homicide 47 Gambling 19 Human Trafficking, Commercial Sex Acts 16 Weapons Offence 15 Human Trafficking (B), Involuntary Servitude 1 Name: Category, dtype: int64
# Collapse categories with fewer than 1000 rows into 'Other'.
mask = data_new['Category'].isin(category_counts[category_counts < 1000].index)
# BUG FIX: the original used chained indexing
# (data_new['Category'][mask] = 'Other'), which raises pandas'
# SettingWithCopyWarning and may silently fail to write through to the
# frame; .loc assigns on the frame itself.
data_new.loc[mask, 'Category'] = 'Other'
data_new['Category'].value_counts()
Larceny Theft 108427 Other Miscellaneous 27001 Non-Criminal 24135 Malicious Mischief 24040 Assault 22233 Burglary 19818 Motor Vehicle Theft 16747 Warrant 12760 Fraud 11037 Lost Property 10628 Recovered Vehicle 9849 Robbery 9008 Other 8614 Missing Person 8356 Suspicious Occ 7729 Offences Against The Family And Children 7184 Disorderly Conduct 6716 Drug Offense 6372 Traffic Violation Arrest 4845 Miscellaneous Investigation 3695 Other Offenses 2902 Weapons Offense 2200 Stolen Property 2008 Case Closure 1641 Forgery And Counterfeiting 1592 Weapons Carrying Etc 1454 Courtesy Report 1145 Sex Offense 1133 Arson 1038 Name: Category, dtype: int64
=> Do dữ liệu khá nhiều nên có thể dùng under sampling để cân bằng các nhãn
from imblearn.under_sampling import ClusterCentroids

# Under-sample every class down to the minority-class size using cluster
# centroids. random_state pins the internal k-means initialisation so the
# resampled data set is reproducible between runs.
cc = ClusterCentroids(random_state=0, n_jobs=-1)
X_resampled, Y_resampled = cc.fit_resample(
    data_new.drop('Category', axis=1),
    data_new[['Category']]
)
Y_resampled.value_counts()
Category Weapons Offense 1038 Missing Person 1038 Assault 1038 Burglary 1038 Case Closure 1038 Courtesy Report 1038 Disorderly Conduct 1038 Drug Offense 1038 Forgery And Counterfeiting 1038 Fraud 1038 Larceny Theft 1038 Lost Property 1038 Malicious Mischief 1038 Miscellaneous Investigation 1038 Motor Vehicle Theft 1038 Weapons Carrying Etc 1038 Non-Criminal 1038 Offences Against The Family And Children 1038 Other 1038 Other Miscellaneous 1038 Other Offenses 1038 Recovered Vehicle 1038 Robbery 1038 Sex Offense 1038 Stolen Property 1038 Suspicious Occ 1038 Traffic Violation Arrest 1038 Warrant 1038 Arson 1038 dtype: int64
del raw_data, data, data_new  # Free memory
X_resampled
| Weekday | Latitude | Longitude | Holiday | BusinessHour | Year | Month | Day | Hour | Minute | ... | Central | Ingleside | Mission | Northern | Out of SF | Park | Richmond | Southern | Taraval | Tenderloin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 37.790788 | -122.419036 | 0 | 0 | 2020 | 8 | 16 | 1 | 32 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 3 | 37.782057 | -122.427396 | 0 | 1 | 2020 | 8 | 16 | 14 | 34 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2 | 37.711317 | -122.422241 | 0 | 1 | 2020 | 8 | 15 | 15 | 5 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 3 | 37.782057 | -122.427396 | 0 | 1 | 2020 | 8 | 16 | 17 | 9 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 37.785790 | -122.412970 | 0 | 0 | 2020 | 8 | 18 | 7 | 40 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30097 | 1 | 37.765183 | -122.417487 | 0 | 0 | 2020 | 3 | 2 | 18 | 30 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 30098 | 1 | 37.784323 | -122.404509 | 0 | 1 | 2018 | 11 | 1 | 16 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 30099 | 3 | 37.752204 | -122.419018 | 0 | 0 | 2020 | 1 | 19 | 22 | 12 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 30100 | 6 | 37.727717 | -122.432363 | 0 | 1 | 2019 | 4 | 17 | 15 | 22 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 30101 | 4 | 37.755869 | -122.399984 | 0 | 0 | 2019 | 6 | 29 | 5 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
30102 rows × 23 columns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Candidate classifiers for the bake-off below. random_state is fixed on
# the stochastic estimators so the comparison is reproducible.
# NOTE(review): KNN, SVC and LogisticRegression are scale-sensitive and
# the features are not standardised here — that likely depresses their
# scores relative to the tree-based models.
models = [
    LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=0),
    DecisionTreeClassifier(criterion='entropy', random_state=0),
    RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=0),
    KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    GaussianNB(),
    SVC(random_state=0),
]
import time
from sklearn.model_selection import cross_val_score, KFold

# 5-fold CV benchmark of every candidate model: record name, wall time
# and mean accuracy.
CV = KFold(n_splits=5, shuffle=True, random_state=0)
entries = []
for model in models:
    model_name = type(model).__name__
    # perf_counter is monotonic and higher-resolution than time.time for
    # measuring elapsed durations.
    begin = time.perf_counter()
    accuracies = cross_val_score(
        model,
        X_resampled,
        np.ravel(Y_resampled),  # 1-D y avoids sklearn's shape warning
        scoring = 'accuracy',
        cv = CV,
        n_jobs = -1
    )
    elapsed = time.perf_counter() - begin
    print(model_name, 'accuracies:', accuracies)
    entries.append([model_name, elapsed, accuracies.mean()])
del models
LogisticRegression accuracies: [0.05464209 0.05298123 0.05166113 0.05963455 0.05946844] DecisionTreeClassifier accuracies: [0.12705531 0.13220395 0.13820598 0.14352159 0.14136213] RandomForestClassifier accuracies: [0.19681116 0.19614682 0.19651163 0.20448505 0.19916944] KNeighborsClassifier accuracies: [0.01145989 0.01328683 0.01245847 0.01262458 0.01312292] GaussianNB accuracies: [0.11343631 0.12306926 0.09800664 0.10714286 0.13504983] SVC accuracies: [0.03105796 0.02989537 0.03106312 0.03089701 0.03122924]
# Tabulate the benchmark results for the comparison plots below.
cv_df = pd.DataFrame(entries, columns=['model_name', 'time', 'accuracy'])
cv_df
| model_name | time | accuracy | |
|---|---|---|---|
| 0 | LogisticRegression | 14.667475 | 0.055677 |
| 1 | DecisionTreeClassifier | 1.516156 | 0.136470 |
| 2 | RandomForestClassifier | 4.909912 | 0.198625 |
| 3 | KNeighborsClassifier | 1.586752 | 0.012591 |
| 4 | GaussianNB | 0.235637 | 0.115341 |
| 5 | SVC | 44.031779 | 0.030829 |
# Side-by-side bars: mean CV accuracy (left) and wall time (right).
fig, (ax_acc, ax_time) = plt.subplots(1, 2, figsize=(15, 7))

ax_acc.bar(cv_df['model_name'], cv_df['accuracy'])
ax_acc.set_ylabel('accuracy')
ax_acc.tick_params(axis='x', rotation=45)
ax_acc.set_title("Độ chính xác")

ax_time.bar(cv_df['model_name'], cv_df['time'])
ax_time.set_ylabel('time')
ax_time.tick_params(axis='x', rotation=45)
ax_time.set_title("Thời gian thực thi")

plt.show()
=> Sử dụng RandomForestClassifier vì có độ chính xác cao nhất
from sklearn.model_selection import train_test_split

# Hold out 20% for the final evaluation. stratify preserves the (balanced)
# class proportions in both splits; random_state makes the split
# reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, Y_resampled, test_size=0.2, stratify=Y_resampled, random_state=0
)
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest.
# NOTE: 'auto' was removed from max_features — for classifiers it is an
# alias of 'sqrt' (and it is deprecated/removed in recent scikit-learn),
# so searching both only wasted a third of the fits.
param_dist = {
    'max_depth' : [70, 90, None],
    'n_estimators' : [100, 200, 300],
    'max_features' : ['sqrt', 'log2'],
    # "min_samples_split" : [3, 4, 5],
    # "min_samples_leaf" : [2, 5, 10],
    'bootstrap' : [True, False],
}
grid = GridSearchCV(
    estimator = RandomForestClassifier(criterion='entropy', n_jobs=-1, random_state=0),
    param_grid = param_dist,
    cv = CV,
    n_jobs = -1,  # parallelise across folds/candidates to cut wall time
    verbose = 2,
)
grid.fit(X_train, np.ravel(y_train))
best_model = grid.best_estimator_
best_model
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 19.7s [Parallel(n_jobs=-1)]: Done 122 tasks | elapsed: 2.7min [Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 7.0min finished
RandomForestClassifier(criterion='entropy', max_features='sqrt',
n_estimators=300, n_jobs=-1)
# Report the best cross-validated accuracy and the winning parameters.
print("Độ chính xác cao nhất:", grid.best_score_)
print("Các tham số tốt nhất cho mô hình:", grid.best_params_)
Độ chính xác cao nhất: 0.1988291657384526
Các tham số tốt nhất cho mô hình: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 300}
# Per-candidate search results (fit times, fold scores, ranks) as a frame.
cv_results = grid.cv_results_
pd.DataFrame(cv_results).head()
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_bootstrap | param_max_depth | param_max_features | param_n_estimators | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.344469 | 5.318649 | 13.852575 | 4.758601 | True | 70 | auto | 100 | {'bootstrap': True, 'max_depth': 70, 'max_feat... | 0.193066 | 0.197467 | 0.185839 | 0.198297 | 0.197674 | 0.194469 | 0.004698 | 21 |
| 1 | 17.496710 | 0.282765 | 3.816190 | 0.180835 | True | 70 | auto | 200 | {'bootstrap': True, 'max_depth': 70, 'max_feat... | 0.195142 | 0.199751 | 0.189161 | 0.201827 | 0.194145 | 0.196005 | 0.004449 | 11 |
| 2 | 20.359822 | 0.726647 | 3.161411 | 2.352581 | True | 70 | auto | 300 | {'bootstrap': True, 'max_depth': 70, 'max_feat... | 0.197426 | 0.196844 | 0.189369 | 0.205565 | 0.195598 | 0.196960 | 0.005172 | 6 |
| 3 | 13.675345 | 3.873482 | 7.158819 | 3.627583 | True | 70 | sqrt | 100 | {'bootstrap': True, 'max_depth': 70, 'max_feat... | 0.191613 | 0.196844 | 0.193522 | 0.196221 | 0.197467 | 0.195133 | 0.002216 | 17 |
| 4 | 10.371213 | 4.274476 | 13.532997 | 3.288205 | True | 70 | sqrt | 200 | {'bootstrap': True, 'max_depth': 70, 'max_feat... | 0.194519 | 0.197674 | 0.187915 | 0.199336 | 0.194975 | 0.194884 | 0.003907 | 18 |
# Average the headline CV statistics over all grid candidates.
mean_list = ['mean_fit_time', 'mean_test_score', 'std_test_score']
mean_results = [cv_results[name].mean() for name in mean_list]
pd.DataFrame(list(zip(mean_list, mean_results)), columns=['cv_results', 'mean'])
| cv_results | mean | |
|---|---|---|
| 0 | mean_fit_time | 20.148450 |
| 1 | mean_test_score | 0.192262 |
| 2 | std_test_score | 0.003592 |
# Final hold-out accuracy of the tuned forest.
print('Độ chính xác trên tập test:', best_model.score(X_test, y_test))
y_pred = best_model.predict(X_test)
# Side-by-side view of true vs predicted categories.
df = pd.DataFrame({
    'Actual' : y_test['Category'],
    'Prediction' : y_pred
})
df.head()
Độ chính xác trên tập test: 0.20245806344461054
| Actual | Prediction | |
|---|---|---|
| 13029 | Miscellaneous Investigation | Suspicious Occ |
| 24065 | Stolen Property | Drug Offense |
| 24510 | Stolen Property | Miscellaneous Investigation |
| 25608 | Suspicious Occ | Drug Offense |
| 27026 | Warrant | Assault |
pd.DataFrame(best_model.predict_proba(X_test))
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.006667 | 0.000000 | 0.040000 | 0.041667 | 0.020000 | 0.056667 | 0.036667 | 0.043333 | 0.060000 | 0.023333 | ... | 0.033333 | 0.000000 | 0.036667 | 0.006667 | 0.033333 | 0.123333 | 0.026667 | 0.010000 | 0.030000 | 0.035000 |
| 1 | 0.000000 | 0.023333 | 0.023333 | 0.016667 | 0.000000 | 0.020000 | 0.243333 | 0.013333 | 0.060000 | 0.003333 | ... | 0.110000 | 0.000000 | 0.010000 | 0.003333 | 0.090000 | 0.023333 | 0.030000 | 0.070000 | 0.003333 | 0.043333 |
| 2 | 0.006667 | 0.000000 | 0.003333 | 0.010000 | 0.003333 | 0.070000 | 0.033333 | 0.010000 | 0.016667 | 0.000000 | ... | 0.110000 | 0.063333 | 0.036667 | 0.010000 | 0.073333 | 0.006667 | 0.033333 | 0.020000 | 0.026667 | 0.073333 |
| 3 | 0.003333 | 0.036667 | 0.090000 | 0.000000 | 0.003333 | 0.033333 | 0.156667 | 0.006667 | 0.053333 | 0.013333 | ... | 0.013333 | 0.020000 | 0.026667 | 0.000000 | 0.020000 | 0.030000 | 0.070000 | 0.130000 | 0.006667 | 0.006667 |
| 4 | 0.000000 | 0.170000 | 0.030000 | 0.000000 | 0.000000 | 0.170000 | 0.046667 | 0.000000 | 0.040000 | 0.000000 | ... | 0.026667 | 0.053333 | 0.060000 | 0.000000 | 0.080000 | 0.020000 | 0.026667 | 0.060000 | 0.006667 | 0.006667 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6016 | 0.003333 | 0.030000 | 0.103333 | 0.006667 | 0.010000 | 0.010000 | 0.036667 | 0.003333 | 0.033333 | 0.000000 | ... | 0.076667 | 0.016667 | 0.093333 | 0.003333 | 0.036667 | 0.073333 | 0.016667 | 0.040000 | 0.006667 | 0.006667 |
| 6017 | 0.000000 | 0.033333 | 0.073333 | 0.000000 | 0.000000 | 0.026667 | 0.026667 | 0.000000 | 0.103333 | 0.013333 | ... | 0.003333 | 0.046667 | 0.026667 | 0.000000 | 0.003333 | 0.053333 | 0.030000 | 0.016667 | 0.000000 | 0.010000 |
| 6018 | 0.000000 | 0.016667 | 0.046667 | 0.000000 | 0.000000 | 0.046667 | 0.130000 | 0.013333 | 0.040000 | 0.010000 | ... | 0.016667 | 0.003333 | 0.190000 | 0.000000 | 0.010000 | 0.100000 | 0.003333 | 0.060000 | 0.030000 | 0.010000 |
| 6019 | 0.003333 | 0.010000 | 0.023333 | 0.006667 | 0.000000 | 0.030000 | 0.120000 | 0.003333 | 0.026667 | 0.000000 | ... | 0.020000 | 0.016667 | 0.043333 | 0.000000 | 0.186667 | 0.056667 | 0.056667 | 0.143333 | 0.013333 | 0.043333 |
| 6020 | 0.000000 | 0.093333 | 0.046667 | 0.000000 | 0.000000 | 0.020000 | 0.023333 | 0.000000 | 0.053333 | 0.253333 | ... | 0.000000 | 0.003333 | 0.083333 | 0.000000 | 0.013333 | 0.043333 | 0.033333 | 0.030000 | 0.000000 | 0.023333 |
6021 rows × 29 columns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out predictions.
cf_matrix = confusion_matrix(y_test, y_pred)
fig = plt.figure(figsize=(17, 10))
# FIX: annot=True without fmt renders integer counts in scientific
# notation (e.g. 1.1e+02); fmt='d' prints plain counts, and labelling the
# axes with the class names makes the matrix readable.
sns.heatmap(
    cf_matrix,
    annot=True,
    fmt='d',
    annot_kws={"size": 12},
    xticklabels=best_model.classes_,
    yticklabels=best_model.classes_,
)
plt.show()
# Per-class precision / recall / F1 on the hold-out set.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
Arson 0.31 0.46 0.37 222
Assault 0.12 0.19 0.15 203
Burglary 0.22 0.28 0.24 219
Case Closure 0.19 0.19 0.19 183
Courtesy Report 0.77 0.68 0.72 216
Disorderly Conduct 0.03 0.02 0.02 214
Drug Offense 0.22 0.27 0.24 226
Forgery And Counterfeiting 0.19 0.19 0.19 225
Fraud 0.10 0.08 0.09 210
Larceny Theft 0.38 0.77 0.51 192
Lost Property 0.20 0.21 0.21 201
Malicious Mischief 0.12 0.10 0.11 231
Miscellaneous Investigation 0.12 0.10 0.11 198
Missing Person 0.11 0.10 0.11 210
Motor Vehicle Theft 0.22 0.30 0.25 196
Non-Criminal 0.11 0.14 0.13 209
Offences Against The Family And Children 0.09 0.06 0.07 208
Other 0.08 0.05 0.06 209
Other Miscellaneous 0.18 0.25 0.21 208
Other Offenses 0.07 0.04 0.05 202
Recovered Vehicle 0.18 0.25 0.21 202
Robbery 0.06 0.06 0.06 190
Sex Offense 0.44 0.46 0.45 197
Stolen Property 0.11 0.09 0.10 194
Suspicious Occ 0.08 0.04 0.05 201
Traffic Violation Arrest 0.08 0.04 0.05 218
Warrant 0.12 0.10 0.11 220
Weapons Carrying Etc 0.24 0.22 0.23 221
Weapons Offense 0.18 0.15 0.17 196
accuracy 0.20 6021
macro avg 0.18 0.20 0.19 6021
weighted avg 0.18 0.20 0.19 6021
import pickle

# Persist the tuned model to disk, then read it back to verify the
# round-trip. NOTE: only unpickle files from trusted sources.
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as fh:
    pickle.dump(best_model, fh)
with open(pkl_filename, 'rb') as fh:
    model = pickle.load(fh)